In [1]:
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt 
import seaborn as sns
import sys
import os
import statsmodels.api as sm
import math
import plotly.express as px #graphing
import plotly.graph_objects as go #graphing
from plotly.subplots import make_subplots #graphing
from datetime import datetime, timedelta
In [2]:
#!pip install missingno
In [3]:
path = "/Users/ycq/Downloads/Principal/"
df =pd.read_csv(path + "/Quant Exercise.csv")

Exploratory Data Analysis¶

In [4]:
df.head(), df.info(), df.shape
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Columns: 551 entries, DP03_0001E to fips
dtypes: float64(300), int64(249), object(2)
memory usage: 292.8+ KB
Out[4]:
(   DP03_0001E  DP03_0001M  DP03_0001PE  DP03_0001PM  DP03_0002E  DP03_0002M  \
 0       84387         137      84387.0          NaN     53245.0         910   
 1     1015608         631    1015608.0          NaN    660484.0        3044   
 2       54444         101      54444.0          NaN     31615.0         519   
 3      136682         202     136682.0          NaN     84153.0        1035   
 4       40064          86      40064.0          NaN     23144.0         447   
 
    DP03_0002PE  DP03_0002PM  DP03_0003E  DP03_0003M  ...  DP03_0136M  \
 0         63.1          1.1       53200         911  ...         NaN   
 1         65.0          0.3      659555        3038  ...         NaN   
 2         58.1          1.0       31593         518  ...         NaN   
 3         61.6          0.7       84051        1040  ...         NaN   
 4         57.8          1.1       23140         447  ...         NaN   
 
    DP03_0136PE  DP03_0136PM  DP03_0137E  DP03_0137M  DP03_0137PE  DP03_0137PM  \
 0          6.2          0.8         NaN         NaN         17.4          1.8   
 1          7.7          0.4         NaN         NaN         21.3          0.7   
 2          8.4          1.1         NaN         NaN         23.8          2.2   
 3          6.8          0.7         NaN         NaN         20.8          1.3   
 4          8.0          1.2         NaN         NaN         25.0          3.2   
 
            GEO_ID     county     fips  
 0  0500000US42001      Adams  42001.0  
 1  0500000US42003  Allegheny  42003.0  
 2  0500000US42005  Armstrong  42005.0  
 3  0500000US42007     Beaver  42007.0  
 4  0500000US42009    Bedford  42009.0  
 
 [5 rows x 551 columns],
 None,
 (68, 551))
In [5]:
df.describe(), df.columns, df.isnull().sum()
Out[5]:
(         DP03_0001E   DP03_0001M   DP03_0001PE  DP03_0001PM    DP03_0002E  \
 count  6.800000e+01    68.000000  6.800000e+01          0.0  6.700000e+01   
 mean   3.075309e+05   263.514706  3.075378e+05          NaN  1.927388e+05   
 std    1.268587e+06   336.222496  1.268588e+06          NaN  8.033616e+05   
 min    3.880000e+03    30.000000  3.880000e+03          NaN  1.493000e+03   
 25%    3.499550e+04    95.750000  3.499550e+04          NaN  1.942900e+04   
 50%    7.116700e+04   157.500000  7.116700e+04          NaN  3.991700e+04   
 75%    1.809575e+05   324.500000  1.809575e+05          NaN  1.047060e+05   
 max    1.045605e+07  2462.000000  1.045605e+07          NaN  6.566126e+06   
 
          DP03_0002M  DP03_0002PE  DP03_0002PM    DP03_0003E    DP03_0003M  \
 count     68.000000    68.000000    68.000000  6.800000e+01     68.000000   
 mean    1165.588235    58.964706     1.079412  1.928849e+05   1165.411765   
 std     1557.778070     6.400088     0.640792  7.963784e+05   1548.812335   
 min      119.000000    21.500000     0.100000  1.493000e+03    119.000000   
 25%      443.250000    56.125000     0.700000  1.961725e+04    442.750000   
 50%      800.500000    59.400000     1.000000  4.101200e+04    801.500000   
 75%     1426.250000    62.850000     1.300000  1.122075e+05   1430.250000   
 max    11721.000000    68.300000     4.400000  6.558087e+06  11636.000000   
 
        ...  DP03_0135PM  DP03_0136E  DP03_0136M  DP03_0136PE  DP03_0136PM  \
 count  ...    68.000000         0.0         0.0    68.000000    68.000000   
 mean   ...     1.447059         NaN         NaN     8.836765     1.242647   
 std    ...     0.749023         NaN         NaN     2.831639     0.684436   
 min    ...     0.200000         NaN         NaN     3.700000     0.200000   
 25%    ...     0.800000         NaN         NaN     6.800000     0.800000   
 50%    ...     1.400000         NaN         NaN     8.500000     1.200000   
 75%    ...     1.700000         NaN         NaN    10.025000     1.500000   
 max    ...     3.800000         NaN         NaN    20.400000     4.200000   
 
        DP03_0137E  DP03_0137M  DP03_0137PE  DP03_0137PM          fips  
 count         0.0         0.0    68.000000    68.000000     67.000000  
 mean          NaN         NaN    23.795588     2.413235  42067.000000  
 std           NaN         NaN     4.925518     1.290251     38.970074  
 min           NaN         NaN    15.300000     0.300000  42001.000000  
 25%           NaN         NaN    20.725000     1.475000  42034.000000  
 50%           NaN         NaN    23.300000     2.200000  42067.000000  
 75%           NaN         NaN    25.725000     2.900000  42100.000000  
 max           NaN         NaN    44.300000     7.000000  42133.000000  
 
 [8 rows x 549 columns],
 Index(['DP03_0001E', 'DP03_0001M', 'DP03_0001PE', 'DP03_0001PM', 'DP03_0002E',
        'DP03_0002M', 'DP03_0002PE', 'DP03_0002PM', 'DP03_0003E', 'DP03_0003M',
        ...
        'DP03_0136M', 'DP03_0136PE', 'DP03_0136PM', 'DP03_0137E', 'DP03_0137M',
        'DP03_0137PE', 'DP03_0137PM', 'GEO_ID', 'county', 'fips'],
       dtype='object', length=551),
 DP03_0001E      0
 DP03_0001M      0
 DP03_0001PE     0
 DP03_0001PM    68
 DP03_0002E      1
                ..
 DP03_0137PE     0
 DP03_0137PM     0
 GEO_ID          0
 county          0
 fips            1
 Length: 551, dtype: int64)
In [6]:
df.dtypes
Out[6]:
DP03_0001E       int64
DP03_0001M       int64
DP03_0001PE    float64
DP03_0001PM    float64
DP03_0002E     float64
                ...   
DP03_0137PE    float64
DP03_0137PM    float64
GEO_ID          object
county          object
fips           float64
Length: 551, dtype: object
Sourced from credit bureau data¶

I renamed the columns that I wanted to use and created a dictionary with the key value pairs for the column header names from the credit bureau website

FIPS is a five-digit Federal Information Processing Standards code which uniquely identifies counties in the United States. I use FIPS along with geojson to create Choropleth Maps.

In [7]:
# Dictionary of all renamed columns
# All of the renamed columns are ESTIMATES from the U.S. Census Bureau
# Columns not renamed include: Percent (PE), Margin of Error (M), Percent Margin of Error (PM)

dict = {# Employment Status
        # Population 16 years and over
        "DP03_0001E" : "total_population", # Total Population elgible for work
        "DP03_0002E" : "labor_force",
        "DP03_0003E" : "civ_labor_force",
        "DP03_0004E" : "total_employed",
        "DP03_0005E" : "total_unemployed",
        "DP03_0006E" : "armed_forces",
        "DP03_0007E" : "not_in_labor_force",
    

        # Females 16 years and over
        "DP03_0010E" : "total_population_female", # Total Population elgible for work
        "DP03_0011E" : "labor_force_female",
        "DP03_0012E" : "civ_labor_force_female",
        "DP03_0013E" : "civ_labor_force_female_employed",
    

        # Households with children
        "DP03_0014E" : "household_children_under_6", # Own children of the householder under 6 years
        # All parents in family in labor force
        "DP03_0015E" : "parents_work_children_under_6", # Own children of the householder under 6 years
        "DP03_0016E" : "household_children_6to17", # Own children of the householder 6 to 17 years
        # All parents in family in labor force
        "DP03_0017E" : "parents_work_children_6to17", # Own children of the householder 6 to 17 years
    

        # Commuting to work
        "DP03_0018E" : "total_workers_commute",
        "DP03_0019E" : "solo_vehicle_commute", # Car, truck, or van -- drove alone
        "DP03_0020E" : "carpool_commute", # Car, truck, or van -- carpooled
        "DP03_0021E" : "public_transportation_commute", # Public transportation (excluding taxicab)
        "DP03_0022E" : "walked_commute",
        "DP03_0023E" : "other_means_commute",
        "DP03_0024E" : "worked_from_home",
        "DP03_0025E" : "mean_commute_time_minutes",
    

        # Occupation
        "DP03_0027E" : "manage_business_sci_art", # Management, business, science, and arts occupations
        "DP03_0028E" : "service_occupations",
        "DP03_0029E" : "sales_and_office_occupations",
        # Natural resources, construction, and maintenance occupations
        "DP03_0030E" : "nr_construction_and_maintenance",
        # Production, transportation, and material moving occupations
        "DP03_0031E" : "production_transportation_mm",
    

        # Industry
        "DP03_0033E" : "ag_forest_fish_hunt_mine", # Agriculture, forestry, fishing and hunting, and mining
        "DP03_0034E" : "construction",
        "DP03_0035E" : "manufacturing",
        "DP03_0036E" : "wholesale_trade",
        "DP03_0037E" : "retail_trade",
        "DP03_0038E" : "transportation_warehousing_utilities",
        "DP03_0039E" : "information",
        "DP03_0040E" : "firerl", # Finance, insurance, real estate, rental and leasing
        # Professional, scientific, and management, and administrative and waste management services
        "DP03_0041E" : "psmawms",
        # Educational services, and health care and social assistance
        "DP03_0042E" : "education_health_care_social",
        # Arts, entertainment, and recreation, and accommodation and food services
        "DP03_0043E" : "art_entertainment_accommodation",
        "DP03_0044E" : "other_services", # Other services, except public administration
        "DP03_0045E" : "public_administration",
    

        # Class of worker
        "DP03_0047E" : "private_wage_and_salary_worker",
        "DP03_0048E" : "government_worker",
        "DP03_0049E" : "self_employed_worker", # Self-employed in own not incorporated business workers
        "DP03_0050E" : "unpaid_family_worker",
    

        # Income and benefits (in 2020 inflation-adjusted dollars)
        # Total households
        "DP03_0051E" : "total_households",
        "DP03_0052E" : "household_less_than_10k",
        "DP03_0053E" : "household_10k_to_15k", # $10,000 to $14,999
        "DP03_0054E" : "household_15k_to_25k", # $15,000 to $24,999
        "DP03_0055E" : "household_25k_to_35k", # $25,000 to $34,999
        "DP03_0056E" : "household_35k_to_50k", # $35,000 to $49,999
        "DP03_0057E" : "household_50k_to_75k", # $50,000 to $74,999
        "DP03_0058E" : "household_75k_to_100k", # $75,000 to $99,999
        "DP03_0059E" : "household_100k_to_150k", # $100,000 to $149,999
        "DP03_0060E" : "household_150k_to_200k", # $150,000 to $199,999
        'DP03_0061E' : "household_200k_plus", # $200,000 or more
        "DP03_0062E" : "household_median_income", # dollars
        "DP03_0063E" : "household_mean_income", # dollars

    
        # Families
        "DP03_0075E" : "total_families",
        "DP03_0076E" : "family_less_than_10k",
        "DP03_0077E" : "family_10k_to_15k", # $10,000 to $14,999
        "DP03_0078E" : "family_15k_to_25k", # $15,000 to $24,999
        "DP03_0079E" : "family_25k_to_35k", # $25,000 to $34,999
        "DP03_0080E" : "family_35k_to_50k", # $35,000 to $49,999
        "DP03_0081E" : "family_50k_to_75k", # $50,000 to $74,999
        "DP03_0082E" : "family_75k_to_100k", # $75,000 to $99,999
        "DP03_0083E" : "family_100k_to_150k", # $100,000 to $149,999
        "DP03_0084E" : "family_150k_to_200k", # $150,000 to $199,999
        "DP03_0085E" : "family_200k_plus", # $200,000 or more
        "DP03_0086E" : "family_median_income", # dollars
        "DP03_0087E" : "family_mean_income", # dollars
        "DP03_0088E" : "per_capita_income",

    
        # Nonfamily Households
        "DP03_0089E" : "total_nonfamily_households",
        "DP03_0090E" : "nonfamily_median_income", # dollars
        "DP03_0091E" : "nonfamily_mean_income", # dollars
    
    
        # Median Earnings
        "DP03_0092E" : "median_earnings_for_workers", # dollars
        "DP03_0093E" : "median_earnings_male_fulltime", # dollars
        "DP03_0094E" : "median_earnings_female_fulltime", # dollars

    
        # Health Insurance Coverage
        "DP03_0095E" : "total_civ_population", # Total Civilian Noninstitutionalized Population
        "DP03_0096E" : "civ_health_insurance_coverage", # Population
        "DP03_0097E" : "civ_private_health_insurance", # Population
        "DP03_0098E" : "civ_public_health_insurance", # Population
        "DP03_0099E" : "civ_no_health_insurance"} # Population

df.rename(columns = dict, inplace = True)
In [8]:
# drop the columns with all missing values
df = df.dropna(axis=1, how="all")
In [9]:
df.head()
Out[9]:
total_population DP03_0001M DP03_0001PE labor_force DP03_0002M DP03_0002PE DP03_0002PM civ_labor_force DP03_0003M DP03_0003PE ... DP03_0134PM DP03_0135PE DP03_0135PM DP03_0136PE DP03_0136PM DP03_0137PE DP03_0137PM GEO_ID county fips
0 84387 137 84387.0 53245.0 910 63.1 1.1 53200 911 63.0 ... 0.8 5.7 0.9 6.2 0.8 17.4 1.8 0500000US42001 Adams 42001.0
1 1015608 631 1015608.0 660484.0 3044 65.0 0.3 659555 3038 64.9 ... 0.3 8.5 0.6 7.7 0.4 21.3 0.7 0500000US42003 Allegheny 42003.0
2 54444 101 54444.0 31615.0 519 58.1 1.0 31593 518 58.0 ... 1.0 9.2 1.6 8.4 1.1 23.8 2.2 0500000US42005 Armstrong 42005.0
3 136682 202 136682.0 84153.0 1035 61.6 0.7 84051 1040 61.5 ... 0.7 7.2 0.9 6.8 0.7 20.8 1.3 0500000US42007 Beaver 42007.0
4 40064 86 40064.0 23144.0 447 57.8 1.1 23140 447 57.8 ... 1.6 9.4 1.6 8.0 1.2 25.0 3.2 0500000US42009 Bedford 42009.0

5 rows × 460 columns

In [10]:
df.tail(), df.county.nunique
Out[10]:
(    total_population  DP03_0001M  DP03_0001PE  labor_force  DP03_0002M  \
 63             43821         104      43821.0      22241.0         767   
 64            294500         356     294500.0     179495.0        1512   
 65             22487          77      22487.0      13226.0         319   
 66            360718         453     360718.0     237353.0        1629   
 67          10456049        2462   10456049.0    6566126.0       11721   
 
     DP03_0002PE  DP03_0002PM  civ_labor_force  DP03_0003M  DP03_0003PE  ...  \
 63         50.8          1.7            22229         766         50.7  ...   
 64         60.9          0.5           179459        1511         60.9  ...   
 65         58.8          1.4            13217         318         58.8  ...   
 66         65.8          0.5           237053        1634         65.7  ...   
 67         62.8          0.1          6558087       11636         62.7  ...   
 
     DP03_0134PM  DP03_0135PE  DP03_0135PM  DP03_0136PE  DP03_0136PM  \
 63          1.7          6.4          1.5          7.4          1.6   
 64          0.5          7.2          0.7          6.4          0.5   
 65          1.4          8.1          1.7          7.8          1.3   
 66          0.5          6.8          0.8          6.3          0.5   
 67          0.1          8.2          0.2          9.0          0.2   
 
     DP03_0137PE  DP03_0137PM          GEO_ID        county     fips  
 63         24.8          2.8  0500000US42127         Wayne  42127.0  
 64         21.4          1.2  0500000US42129  Westmoreland  42129.0  
 65         21.9          2.7  0500000US42131       Wyoming  42131.0  
 66         20.0          1.4  0500000US42133          York  42133.0  
 67         23.3          0.3     0400000US42  Pennsylvania      NaN  
 
 [5 rows x 460 columns],
 <bound method IndexOpsMixin.nunique of 0            Adams
 1        Allegheny
 2        Armstrong
 3           Beaver
 4          Bedford
           ...     
 63           Wayne
 64    Westmoreland
 65         Wyoming
 66            York
 67    Pennsylvania
 Name: county, Length: 68, dtype: object>)
In [11]:
df0 = df[df["county"] == "Pennsylvania"]
df = df[df["county"] != "Pennsylvania"]
In [12]:
print(df.county.unique())
['Adams' 'Allegheny' 'Armstrong' 'Beaver' 'Bedford' 'Berks' 'Blair'
 'Bradford' 'Bucks' 'Butler' 'Cambria' 'Cameron' 'Carbon' 'Centre'
 'Chester' 'Clarion' 'Clearfield' 'Clinton' 'Columbia' 'Crawford'
 'Cumberland' 'Dauphin' 'Delaware' 'Elk' 'Erie' 'Fayette' 'Forest'
 'Franklin' 'Fulton' 'Greene' 'Huntingdon' 'Indiana' 'Jefferson' 'Juniata'
 'Lackawanna' 'Lancaster' 'Lawrence' 'Lebanon' 'Lehigh' 'Luzerne'
 'Lycoming' 'McKean' 'Mercer' 'Mifflin' 'Monroe' 'Montgomery' 'Montour'
 'Northampton' 'Northumberland' 'Perry' 'Philadelphia' 'Pike' 'Potter'
 'Schuylkill' 'Snyder' 'Somerset' 'Sullivan' 'Susquehanna' 'Tioga' 'Union'
 'Venango' 'Warren' 'Washington' 'Wayne' 'Westmoreland' 'Wyoming' 'York']

Above is most of the data cleaning process, with labeling, missing value treatment, data formatting.¶

Data Visulization - a brief overview of the data¶

In [13]:
plt.rcParams["figure.figsize"] = (12, 8)
In [14]:
# Importing county data for Plotly Choropleth Maps
from urllib.request import urlopen
import json
with urlopen("https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json") as response:
    counties = json.load(response)
In [15]:
#Pennsylvania Household Income and Benefits 

plt.style.use('seaborn-dark')
plot = df0[["county", "household_less_than_10k", "household_10k_to_15k", "household_15k_to_25k",
                      "household_25k_to_35k", "household_35k_to_50k", "household_50k_to_75k",
                      "household_75k_to_100k", "household_100k_to_150k", "household_150k_to_200k",
                      "household_200k_plus"]].plot(x = "county", kind = "bar", cmap = "Spectral")

plt.grid(axis = 'y', alpha = 0.3)
plot.set_xticklabels(plot.get_xticklabels(), rotation = 360, fontsize = 20)
plt.title("Distribution of Pennsylvania Total Household Income", fontsize = 25)
plt.legend(bbox_to_anchor = (1.02, 1), loc = 2, borderaxespad = 0, fontsize = 15)
/var/folders/2r/y399hm9d0hv3ysx7v92kl9l80000gn/T/ipykernel_64870/2960315682.py:3: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.
  plt.style.use('seaborn-dark')
Out[15]:
<matplotlib.legend.Legend at 0x7fdc100fd840>
In [16]:
# Percentage of Households with less than $50,000 income 

df["household_less_than_50k"] = df["household_less_than_10k"] + df["household_10k_to_15k"] + df["household_15k_to_25k"] + df["household_25k_to_35k"] + df["household_35k_to_50k"]

df["household_less_than_50k_percentage"] = (df["household_less_than_50k"]/df["total_households"]) * 100

fig = px.choropleth_mapbox(df, geojson = counties, locations = "fips", 
                           color = "household_less_than_50k_percentage",
                           color_continuous_scale = "Reds",
                           mapbox_style = "carto-darkmatter",
                           zoom = 6.25, center = {"lat": 41, "lon": -77.65},
                           hover_name = "county",
                           labels = {"household_less_than_50k_percentage": "% Household < $50,000 💰"}
                          )
                                     
fig.update_layout(margin = {"r": 0,"t": 0,"l": 0,"b": 0})
fig.update_layout(template = "plotly_white")
fig.show()
In [17]:
# Percentage of Households with more than $100,000 income 

df["household_100k_plus"] = df["household_100k_to_150k"] + df["household_150k_to_200k"] + df["household_200k_plus"]

df["household_100k_plus_percentage"] = (df["household_100k_plus"]/df["total_households"]) * 100

fig = px.choropleth_mapbox(df, geojson = counties, locations = "fips", 
                           color = "household_100k_plus_percentage",
                           color_continuous_scale = "Greens",
                           mapbox_style = "carto-darkmatter",
                           zoom = 6.25, center = {"lat": 41, "lon": -78},
                           hover_name = "county",
                           labels = {"household_100k_plus_percentage": "% Household > $100,000 💰"}
                          )
                                     
fig.update_layout(margin = {"r": 0,"t": 0,"l": 0,"b": 0})
fig.update_layout(template = "plotly_white")
fig.show()

Pennsylvania Employment Analysis¶

In [18]:
# Total Employment Rate

df["employment_rate"] = (df["total_employed"]/df["labor_force"]) * 100

fig = px.choropleth_mapbox(df, geojson = counties, locations = "fips", 
                           color = "employment_rate",
                           color_continuous_scale = "RdBu_r",
                           range_color = (91.5, 97.5),
                           mapbox_style = "carto-darkmatter",
                           zoom = 6.25, center = {"lat": 41, "lon": -77.65},
                           hover_name = "county",
                           hover_data = ["total_employed", "total_unemployed"],
                           labels = {"employment_rate": "Employment Rate",
                                     "total_employed": "Total Employed",
                                     "total_unemployed": "Total Unemployed"}
                          )

fig.update_layout(margin = {"r": 0,"t": 0,"l": 0,"b": 0})
fig.update_layout(template = "plotly_dark")
fig.show()
In [19]:
# Employment Rate Percentage by County¶


df["labor_force_male"] = df["labor_force"] - df["labor_force_female"]
df["total_male_employed"] = df["total_employed"] - df["civ_labor_force_female_employed"]
df["employment_rate_male"] = (df["total_male_employed"]/df["labor_force_male"]) * 100
df["employment_rate_female"] = (df["civ_labor_force_female_employed"]/df["civ_labor_force_female"]) * 100


plt.style.use("Solarize_Light2")

x1 = df.employment_rate_male
x2 = df.employment_rate_female
x3 = df.employment_rate
y = df.county

plt.figure(figsize = (6, 14), dpi = 80)
plt.scatter(x1, y, color = "#0000FF", edgecolors = "#000000", s = 50, alpha = 0.75, label = "Male Employment Rate %")
plt.scatter(x2, y, color = "#FF00FF", edgecolors = "#000000", s = 50, alpha = 0.75, label = "Female Employment Rate %")
plt.plot(x3, y, color = "#000000", alpha = 0.5, linestyle = "dashed", label = "Total Employment Rate %")
plt.grid(color = "#d3d3d3", linestyle = '-', linewidth = 0.75)
plt.title("Employment Rate % by County")
plt.xlabel("Employment Rate Percentage")
plt.ylabel("")
plt.legend(loc = 2)
plt.show()
In [ ]:
 

Pennsylvania Median Earnings by County¶

In [20]:
#Pennsylvania Median Earnings by County 


plt.style.use("Solarize_Light2")

x1 = df.median_earnings_for_workers
x2 = df.median_earnings_male_fulltime
x3 = df.median_earnings_female_fulltime
y = df.county

plt.figure(figsize = (8, 14), dpi = 80)
plt.scatter(x1, y, color = "#000000", alpha = 1, s = 12, label = "All Workers")
plt.plot(x1, y, color = "#000000", alpha = 0.75)
plt.scatter(x2, y, color = "#0000FF", edgecolors = "#000000", label = "Male Full Time")
plt.plot(x2, y, color = "#0000FF", alpha = 0.75, linestyle = "--")
plt.scatter(x3, y, color = "#FF00FF", edgecolors = "#000000", label = "Female Full Time")
plt.plot(x3, y, color = "#FF00FF", alpha = 0.75, linestyle = "--")
plt.grid(color = "#d3d3d3", linestyle = '-', linewidth = 2)
plt.title("Pennsylvania Median Earnings by County")
plt.xlabel("Median Earnings (dollars)")
plt.ylabel("")
plt.legend(loc = 1)
plt.show()
In [21]:
#Pennsylvania Per Capita Income by County


plt.style.use("seaborn-dark")

x = df.per_capita_income
y = df.county

plt.figure(figsize = (8, 14), dpi = 80)
plt.scatter(x, y, color = "#00DB16", alpha = 1, s = 100, edgecolors = "#d3d3d3", label = "Per Capita Income (USD)")
plt.plot(x, y, color = "#00DB16", linestyle = "dotted")
plt.grid(color = "#d3d3d3", linestyle = '-', linewidth = 0.25)
plt.title("Pennsylvania Per Capita Income by County")
plt.xlabel("Per Capita Income (dollars)")
plt.ylabel("")
plt.legend(loc = 1)
plt.show()
/var/folders/2r/y399hm9d0hv3ysx7v92kl9l80000gn/T/ipykernel_64870/3704097254.py:4: MatplotlibDeprecationWarning:

The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.

How People in Pennsylvania Commute to Work¶

In [22]:
# Percentage Worked From Home 


df["worked_from_home_percentage"] = (df["worked_from_home"]/df["total_workers_commute"]) * 100

fig = px.choropleth_mapbox(df, geojson = counties, locations = "fips", 
                           color = "worked_from_home_percentage",
                           color_continuous_scale = "Viridis",
                           mapbox_style = "carto-darkmatter",
                           zoom = 6.25, center = {"lat": 41, "lon": -77.65},
                           hover_name = "county",
                           labels = {"worked_from_home_percentage": "% Working From Home"}
                          )

fig.update_layout(margin = {"r": 0,"t": 0,"l": 0,"b": 0})
fig.update_layout(template = "plotly_dark")
fig.show()

Pennsylvania Health Insurance Coverage Analysis¶

In [24]:
#civ_health_insurance_coverage_percentage

df["civ_health_insurance_coverage_percentage"] = (df["civ_health_insurance_coverage"]/df["total_civ_population"]) * 100

fig = px.choropleth_mapbox(df, geojson = counties, locations = "fips", 
                           color = "civ_health_insurance_coverage_percentage",
                           color_continuous_scale = "Picnic",
                           mapbox_style = "carto-darkmatter",
                           zoom = 6.25, center = {"lat": 41, "lon": -77.65},
                           hover_name = "county",
                           labels = {"civ_health_insurance_coverage_percentage": "Percentage w/ Health Insurance 🏥"}
                          )
                                     
fig.update_layout(margin = {"r": 0,"t": 0,"l": 0,"b": 0})
fig.update_layout(template = "plotly_dark")
fig.show()
In [ ]:
 
In [ ]:
 
In [46]:
#!pip install geopandas==0.8.1
#!pip install pyshp==1.2.10
#!pip install shapely==1.6.3
In [76]:
 
In [48]:
#import plotly.figure_factory as ff
#fig = ff.create_choropleth(fips=df.fips, 
#                           scope=['PA'],
#                          values=df.total_population, 
#                           title='PA total population by County', 
#                           legend_title='')
#fig.layout.template = None
#fig.show()
In [85]:
#import plotly.figure_factory as ff

#values = range(len(df.fips))

#fig = ff.create_choropleth(fips=df.fips, values=values)
#fig.layout.template = None
#fig.show()
In [ ]:
 
In [ ]:
 
In [ ]: